Goal: Capstone Dashboard
Tahapan: - Preprocessing - Cleansing - Perubahan tipe data - Feature Engineering - Aggregasi data - Buat Tooltip - Visualisasi Data - Buat visualisasi interaktif - Web dashboard
Kita akan coba mengeksplorasi kembali data Pakistan Largest Ecommerce Dataset menggunakan package dplyr! Mari kita baca terlebih dahulu data kita:
# read data
data <- read.csv("Pakistan Largest Ecommerce Dataset.csv", stringsAsFactors = TRUE, encoding = "latin1")
# cleansing data
data_clean <- data %>%
select(
#membersihkan yang tidak perlu
-c(increment_id, sales_commission_code, Working.Date, BI.Status,MV, Year, Month, Customer.Since, M.Y, FY, X, X.1, X.2, X.3, X.4)) %>%
mutate(
#ngubah tipe data
created_at= dmy(created_at),
sku= as.character(sku),
#nambah kolom
month= month(created_at, label = FALSE, abbr = TRUE),
year= year(created_at)
) %>%
#gantii nama kolom
rename(
c(product_name = sku, category_name = category_name_1)) %>%
filter(
#filter hanya menggunakan tahun 2017
year(created_at) %in% c(2017),
category_name != "\\N",
grepl("_", product_name) # filter nama produk yang tidak adaa "_" atau kelihatan seperti barcode
)
data_clean#> item_id status created_at product_name price
#> 0 0 0 0 0
#> qty_ordered grand_total category_name discount_amount payment_method
#> 0 0 0 0 0
#> Customer.ID month year
#> 0 0 0
Poin captone: - ketepatan kita menggunakan plot dalam membandingkan data yang ingin kitaa representasikan
#Page 1(Overview) ##PLOT 1: (line plot) “Monthly Sales Trend in Pakistan for 2017”
# Data Wrangling
sales_trend <- data_clean %>%
group_by(month) %>%
summarise(total_sales = sum(grand_total)) %>%
ungroup() %>%
arrange(month)
sales_trend# nambah kolom month singkatan & full month ~> utk x axis di plot & tooltip
sales_trend <- sales_trend %>%
mutate(
# kolom singkatan utk sumbu x
month_name = factor(month, levels = 1:12, labels = month.abb),
# kolom full month utk tooltip
full_month_name = factor(month, levels = 1:12, labels = month.name)
)
sales_trend#buat tooltip desc
sales_trend <- sales_trend %>%
mutate(
label = glue(
"Total Sales: {number(total_sales, big.mark = '.', decimal.mark =',', accuracy = 1)}
Month: {full_month_name}"
)
)
sales_trend# Visualization ~> dibuat ggplot nyaa
#"Monthly Sales Trend in Pakistan for 2017"
plot1 <- ggplot(sales_trend, aes(x=month_name, y= total_sales))+
geom_line(col="darkgreen", group=1) +
geom_point(aes(text=label), col="black") +
scale_y_continuous(labels = label_number(big.mark = ".", decimal.mark = ","),
breaks = seq(0, 100000000, 10000000)) +
labs(
title = "Monthly Sales Trend",
x = NULL,
y = "Total Sales"
) +
theme_minimal() +
theme(
plot.title = element_text(hjust = 0.5)
)
ggplotly(plot1, tooltip = "text")This shows the eCommerce sales growth throughout the year.
Total_Sales<- data_clean %>% summarise(Total_sales=number(sum(grand_total), big.mark = '.', decimal.mark =',', accuracy = 1))
Total_SalesTotal_product<- data_clean %>% summarise(total_product=number(sum(qty_ordered), big.mark = '.', decimal.mark =',', accuracy = 1))
Total_productTotal_customer<- data_clean %>%
summarise(total_cust=number(length(unique(Customer.ID)), big.mark = '.', decimal.mark =',', accuracy = 1))
Total_customer##PLOT 2: (lolipop plot) “Total Orders by Payment Method in Pakistan”
# Data Wrangling
fav_payment<- data_clean %>%
group_by(payment_method) %>%
summarise(Payment_type = n()) %>%
ungroup() %>%
arrange(-Payment_type)
fav_paymentfav_payment <- fav_payment %>%
mutate(
label = glue(
"Payment Method: {payment_method}
Total Orders: {number(Payment_type, big.mark = '.', decimal.mark =',', accuracy = 1)}"
)
)
fav_payment# Cara 1: INI PAKE LOLIPOP PLOT, efektif
# Visualization ~> dibuat ggplot nyaa
#"Fav Payment Method"
plot2 <- ggplot(fav_payment, aes(x = Payment_type,
y = reorder(payment_method, Payment_type),
text = label)) +
geom_segment(aes(x= 0, xend=Payment_type, yend=reorder(payment_method, Payment_type)), color="darkgreen", size= 1) +
geom_point(color="black", size=3) +
scale_x_continuous(labels = comma) +
labs(title = "Top Payment Method",
x = "Total Orders",
y = NULL) +
theme_minimal() +
theme(
plot.title = element_text(hjust = 0.5)
)
ggplotly(plot2, tooltip = "text")# Cara 2: INI PAKE BAR PLOT, tp susah utk buka tooltip rank paling rendah krn qty dikit jd barnya jg tipis
# Visualization ~> dibuat ggplot nyaa
#"Fav Payment Method"
plot2 <- ggplot(fav_payment, aes(x = Payment_type,
y = reorder(payment_method, Payment_type),
text = label)) +
geom_col(aes(fill=Payment_type)) +
scale_x_continuous(labels = comma)+
scale_fill_gradient(low="black", high="darkgreen") +
labs(title = "Top Payment Method in Pakistan 2017",
x = "Total Orders",
y = NULL) +
theme_minimal()+
theme(legend.position = "none")
ggplotly(plot2, tooltip = "text")This identifies the most popular payment methods in the country.
#Page 2(Sales Analysis) ## PLOT 3: (line plot) “Monthly total sales in each category”
# Data Wrangling
category_sales_trend <- data_clean %>%
filter(category_name =="Beauty & Grooming") %>%
group_by(month) %>%
summarise(total_sales_category = sum(grand_total)) %>%
ungroup() %>%
arrange(month)
category_sales_trend# nambah kolom month singkatan & full month ~> utk x axis di plot & tooltip
category_sales_trend <- category_sales_trend %>%
mutate(
# kolom singkatan utk sumbu x
month_name = factor(month, levels = 1:12, labels = month.abb),
# kolom full month utk tooltip
full_month_name = factor(month, levels = 1:12, labels = month.name)
)
category_sales_trend#buat tooltip desc
category_sales_trend <- category_sales_trend %>%
mutate(
label = glue(
"Total Sales: {number(total_sales_category, big.mark = '.', decimal.mark =',', accuracy = 1)}
Month: {full_month_name}"
)
)
category_sales_trend# CARA 1: INI DGN LINE PLOT, GAMPANG DIBACA
# Visualization ~> dibuat ggplot nyaa
#"Monthly total sales in each category"
plot3 <- ggplot(category_sales_trend, aes(x=month_name, y= total_sales_category))+
geom_line(col="darkgreen", group=1) +
geom_point(aes(text=label), col="black") +
scale_y_continuous(labels = label_number(big.mark = '.', decimal.mark =','),
breaks = seq(0, 100000000, 200000)) +
labs(
title = "Monthly Sales for Beauty & Grooming",
x = NULL,
y = "Total Sales"
) +
theme_minimal() +
theme(
plot.title = element_text(hjust = 0.5)
)
ggplotly(plot3, tooltip = "text")# CARA 2: INI DGN LOLIPOP PLOT
# Visualization ~> dibuat ggplot nyaa
#"Monthly total sales in each category"
plot3 <- ggplot(category_sales_trend, aes(x=month_name, y= total_sales_category, text=label)) +
geom_segment(aes(xend=month_name, y=0,yend=total_sales_category), color="darkgreen", size= 1) +
geom_point(color="black", size=3) +
scale_y_continuous(labels = label_number(, big.mark = '.', decimal.mark =','),
breaks = seq(0, 100000000, 200000)) +
labs(title = "Monthly Total Sales on Beauty & Grooming in Pakistan 2017",
x = NULL,
y = "Total Sales") +
theme_minimal() +
theme(
plot.title = element_text(hjust = 0.5)
)
ggplotly(plot3, tooltip = "text")This gives a view of which month was has the most sales in Pakistan.
##PLOT 4: (bar plot) “Top 10 Best Selling Product in each category”
# Data Wrangling
fav_product <- data_clean %>%
filter(category_name =="Mobiles & Tablets") %>%
group_by(product_name) %>%
summarise(total_qty = sum(qty_ordered), total_sold= sum(grand_total)) %>%
ungroup() %>%
arrange(-total_qty) %>%
head(10)
fav_productprocess_product_name <- function(product_name) {
parts <- str_split(product_name, " - ") %>% unlist() # Split by ' - '
if (length(parts) > 2) {
# Combine the first and last part (brand name & color)
return(paste(parts[1], parts[length(parts)], sep = " - "))
} else {
# If there are not 3 parts, return the product name as is
return(product_name)
}
}library(stringr)
fav_product <- fav_product %>%
mutate(short_name = sapply(product_name, process_product_name)) %>%
mutate(
label = glue(
"Product : {product_name}
Total Sales : {number(total_sold, big.mark='.', decimal.mark=',', accuracy = 1)}
Quantity : {number(as.numeric(total_qty), big.mark='.', decimal.mark=',', accuracy = 1)}"
)
)
fav_product# Visualization ~> dibuat ggplot nyaa
# "Top 10 Best Selling Product in each category"
plot4 <- ggplot(fav_product, aes(x = total_sold,
y = reorder(short_name, total_sold),
text = label)) +
geom_col(aes(fill = total_sold)) +
scale_fill_gradient(low="black", high="darkgreen") +
labs(title = "Top Mobiles & Tablets Products",
x = "Total Sales",
y = NULL) +
scale_x_continuous(labels = label_number(big.mark = '.', decimal.mark =',')) +
theme_minimal() +
theme(legend.position = "none", plot.title = element_text(hjust = 0.5))
ggplotly(plot4, tooltip = "text")This shows the most popular product by the number of items sold & contributed to the highest sales in Pakistan.
##PLOT 5: (bar plot) “Quantity Product in each category Ordered over the year 2017”
# Data Wrangling
rank_qty_pcategory <- data_clean %>%
filter(created_at >= as.Date("2017-01-01") & created_at <= as.Date("2017-01-06")) %>%
group_by(category_name) %>%
summarise(qty_pcategory = sum(qty_ordered)) %>%
ungroup() %>%
arrange(-qty_pcategory)
rank_qty_pcategory#buat desc tooltip
rank_qty_pcategory <- rank_qty_pcategory %>%
mutate(label = glue(
"Category : {category_name}
Quantity : {number(qty_pcategory, big.mark = '.', decimal.mark =',', accuracy = 1)}"
)
)
rank_qty_pcategory# Visualization ~> dibuat ggplot nyaa
# Quantity Product in each category Ordered over the year 2017"
plot5 <- ggplot(rank_qty_pcategory, aes(x = qty_pcategory,
y = reorder(
category_name, qty_pcategory),
text = label)) +
geom_col(aes(fill = qty_pcategory)) +
scale_fill_gradient(low="black", high="darkgreen") +
labs(title = "Top Selling Categories",
x = "Quantity",
y = NULL) +
scale_x_continuous(labels = label_number(big.mark = '.', decimal.mark =',')) +
theme_minimal() +
theme(legend.position = "none", plot.title = element_text(hjust = 0.5))
ggplotly(plot5, tooltip = "text")This shows the most popular categories based on the qty from a certain range of date.
##PLOT 6: (scatter plot) “Relationship between price and quantity from a certain range of date”
# Data Wrangling
rank_qty_pprice <- data_clean %>%
filter(created_at >= as.Date("2017-01-01") & created_at <= as.Date("2017-12-31")) %>%
group_by(price) %>%
summarise(qty_pprice = sum(qty_ordered)) %>%
ungroup() %>%
arrange(-qty_pprice)
rank_qty_pprice#buat desc tooltip
rank_qty_pprice <- rank_qty_pprice %>%
mutate(label = glue(
"Price : {number(price, big.mark = '.', decimal.mark =',', accuracy = 1)}
Quantity : {number(qty_pprice, big.mark = '.', decimal.mark =',', accuracy = 1)}"
)
)
rank_qty_pprice# Visualization ~> dibuat ggplot nyaa
# Quantity Product in each category Ordered over the year 2017"
plot6 <- ggplot(rank_qty_pprice, aes(x = qty_pprice,
y = price,
text = label)) +
geom_jitter(color="darkgreen") +
labs(title = "Relationship Between Price and Quantity Sold Products",
x = "Quantity",
y = "Price") +
scale_x_continuous(labels = label_number(big.mark = '.', decimal.mark =',')) +
scale_y_continuous(labels = label_number(big.mark = '.', decimal.mark =','))+
theme_minimal() +
theme(
plot.title = element_text(hjust = 0.5)
)
ggplotly(plot6, tooltip = "text")Shows the relationship between price and quantity. The higher the price, the less it is sold, and the higher it was sold, the lower the price are.
##PLOT 7: (line plot) “Sales/ Trend over date with many categories(top 8)”
# Data Wrangling
rank_sales_cat <- data_clean %>%
filter(created_at >= as.Date("2017-01-01") & created_at <= as.Date("2017-01-06")) %>%
group_by(category_name) %>%
summarise(sales = sum(grand_total)) %>%
ungroup() %>%
arrange(-sales)
rank_sales_catrank_sales_cat <- rank_sales_cat %>%
mutate(
label = glue(
"Category: {category_name}
Total Sales: {number(sales, big.mark = '.', decimal.mark =',', accuracy = 1)}"
)
)
rank_sales_cat# Visualization ~> dibuat ggplot nyaa
#"Top Categories by Sales"
plot7 <- ggplot(rank_sales_cat, aes(x = sales,
y = reorder(category_name, sales),
text = label)) +
geom_segment(aes(x= 0, xend=sales, yend=reorder(category_name, sales)), color="darkgreen", size= 1) +
geom_point(color="black", size=3) +
scale_x_continuous(labels = label_number(big.mark = '.', decimal.mark =',')) +
labs(title = "Top Category Sales",
x = "Total Sales",
y = NULL) +
theme_minimal() +
theme(
plot.title = element_text(hjust = 0.5)
)
ggplotly(plot7, tooltip = "text")This ranked the highest sales categories from a certain range of date.
#DONE